{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fd1a39e2",
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install torch==2.2\n",
    "!pip install huggingface_hub==0.20.3\n",
    "!pip install transformers==4.37.2\n",
    "!pip install datasets==2.17.0\n",
    "!pip install numpy==1.26.4"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8f8eb2a9-d2d8-44df-b98c-bc1f401ac372",
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import hf_api\n",
    "datasets = hf_api.list_datasets()\n",
    "len([d for d in datasets])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "b662538f-c500-4711-a346-4aa084ad7244",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default\n",
      "Reusing dataset rotten_tomatoes (/Users/ashish.jha/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "1c49a19ccd6f48329ddf591fd5bd91b6",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/3 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c5893ab9a1e34d469f0744b02c10f2d8",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/9 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "6e498af0ba34485a95d12ea4b2376411",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b557eb2a4bd840d59e75f9b89605f6ba",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import torch\n",
    "from datasets import load_dataset\n",
    "from transformers import BertTokenizer\n",
    "\n",
    "# Loading a dataset from HuggingFace Datasets library\n",
    "dataset = load_dataset(\"rotten_tomatoes\")\n",
    "\n",
    "# Initializing a tokenizer\n",
    "tokenizer = BertTokenizer.from_pretrained(\"bert-base-uncased\")\n",
    "\n",
    "# Tokenizing and preparing the dataset for PyTorch\n",
    "def tokenize_function(example):\n",
    "    return tokenizer(example[\"text\"], padding=\"max_length\", truncation=True)\n",
    "\n",
    "tokenized_dataset = dataset.map(tokenize_function, batched=True)\n",
    "tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])\n",
    "\n",
    "# Creating PyTorch DataLoader\n",
    "train_dataloader = torch.utils.data.DataLoader(tokenized_dataset['train'], batch_size=8, shuffle=True)\n",
    "eval_dataloader = torch.utils.data.DataLoader(tokenized_dataset['test'], batch_size=8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "29b5af5e-7fc3-48e0-9c63-134fb550d778",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1067/1067 [4:03:15<00:00, 13.68s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 1 - Evaluation Accuracy: 0.800187617260788\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1067/1067 [4:10:06<00:00, 14.06s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 2 - Evaluation Accuracy: 0.8292682926829268\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1067/1067 [4:15:38<00:00, 14.38s/it]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Epoch 3 - Evaluation Accuracy: 0.8461538461538461\n"
     ]
    }
   ],
   "source": [
    "from tqdm import tqdm\n",
    "from transformers import BertForSequenceClassification, AdamW\n",
    "\n",
    "# Load pre-trained BERT model for sequence classification\n",
    "model = BertForSequenceClassification.from_pretrained(\"bert-base-uncased\")\n",
    "\n",
    "# Optimizer and learning rate scheduler setup\n",
    "optimizer = AdamW(model.parameters(), lr=5e-5)\n",
    "\n",
    "# Training loop using PyTorch\n",
    "for epoch in range(3):  # Train for 3 epochs as an example\n",
    "    model.train()\n",
    "    for batch in tqdm(train_dataloader):\n",
    "        optimizer.zero_grad()\n",
    "        input_ids = batch['input_ids']\n",
    "        attention_mask = batch['attention_mask']\n",
    "        labels = batch['label']\n",
    "\n",
    "        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)\n",
    "        loss = outputs.loss\n",
    "        loss.backward()\n",
    "        optimizer.step()\n",
    "\n",
    "    # Evaluation loop\n",
    "    model.eval()\n",
    "    total_correct = 0\n",
    "    total_samples = 0\n",
    "    for batch in tqdm(eval_dataloader):\n",
    "        with torch.no_grad():\n",
    "            input_ids = batch['input_ids']\n",
    "            attention_mask = batch['attention_mask']\n",
    "            labels = batch['label']\n",
    "\n",
    "            outputs = model(input_ids=input_ids, attention_mask=attention_mask)\n",
    "            predictions = torch.argmax(outputs.logits, dim=1)\n",
    "\n",
    "            total_correct += (predictions == labels).sum().item()\n",
    "            total_samples += len(labels)\n",
    "\n",
    "    accuracy = total_correct / total_samples\n",
    "    print(f\"Epoch {epoch + 1} - Evaluation Accuracy: {accuracy}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f15d030f-dae8-4403-a505-658af0cec891",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93962529-a401-4611-aa1f-8b5a63cba99f",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (Local)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
